/**
 * Copyright (C), 2017-2018, XXX有限公司
 * FileName: WidthFirstSpider
 * Author:   zengjian
 * Date:     2018/7/27 17:51
 * Description: 爬虫公共属性
 * History:
 * <author>          <time>          <version>          <desc>
 * 作者姓名           修改时间           版本号              描述
 */
package third.spider;

import java.util.Collection;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import redis.clients.jedis.JedisPool;
import third.redis.RedisClient;
import third.spider.downloader.SimpleDownloader;
import third.spider.parser.ATagParser;

/**
 * 〈宽度优先爬虫〉<br>
 * 〈一句话描述〉
 *
 * @author zengjian
 * @create 2018/7/27 17:51
 */
public class WidthFirstSpider implements Spider {

    private static final Logger LOGGER = LoggerFactory.getLogger(WidthFirstSpider.class);

    /** Redis set key holding URLs that matched the rule (the crawl result). */
    public static final String TARGET_URL = "TARGET_URL";

    /** Redis set key holding discovered URLs still waiting to be crawled. */
    public static final String TODO_URL = "TODO_URL";

    /**
     * Page downloader (lazily created in {@link #init()} if not injected).
     */
    private SimpleDownloader downLoader;


    /**
     * Redis client backing the TODO/TARGET URL sets.
     */
    private RedisClient redisClient;

    /**
     * Rule deciding whether a URL is a crawl target.
     */
    private Rule rule;

    /**
     * Delay between crawl iterations, in milliseconds (politeness throttle).
     */
    private long waittime = 1000L;

    /** Number of download failures. */
    private AtomicLong errorCount = new AtomicLong(0);

    /** Number of URLs that matched the rule and were stored as targets. */
    private AtomicLong successCount = new AtomicLong(0);

    /** Number of parsed URLs rejected by the rule. */
    private AtomicLong discardCount = new AtomicLong(0);

    /** Guards one-time lazy initialization of collaborators. */
    private AtomicBoolean init = new AtomicBoolean(false);


    /**
     * Breadth-first crawl starting from {@code rootUrl}: pops URLs from the
     * Redis TODO set, downloads each page, extracts its links, stores matching
     * links in the TARGET set and pushes all extracted links back onto the
     * TODO set.
     *
     * <p>NOTE(review): the loop condition stops the entire crawl as soon as a
     * popped URL fails {@code rule.match} — since all extracted URLs (matching
     * or not) are pushed onto TODO, this may terminate early; confirm whether
     * that is intended before changing it.
     *
     * @param rootUrl the seed URL to start crawling from
     * @throws DownLoaderException   declared for API compatibility; download
     *                               errors are currently caught and counted
     * @throws InterruptedException if the politeness sleep is interrupted
     */
    public void doCrawlerUrls(String rootUrl) throws DownLoaderException, InterruptedException {
        // Lazily wire default collaborators exactly once.
        if (init.compareAndSet(false, true)) {
            init();
        }
        redisClient.sadd(TODO_URL, rootUrl);
        if (rule.match(rootUrl)) {
            redisClient.sadd(TARGET_URL, rootUrl);
        }

        for (String todoUrl = redisClient.spop(TODO_URL); todoUrl != null && rule.match(todoUrl); todoUrl = redisClient.spop(TODO_URL)) {
            String content = null;
            try {
                // FIX: was string concatenation ("…{}" + todoUrl), which left the
                // placeholder unsubstituted; use SLF4J parameterized logging.
                LOGGER.info("爬取Url:{}", todoUrl);
                content = downLoader.downloadHtml(todoUrl);
            } catch (DownLoaderException e) {
                LOGGER.error("{}下载异常，错误次数{}", todoUrl, errorCount.incrementAndGet(), e);
            }
            if (content == null) {
                // FIX: download failed — nothing to parse; previously null was
                // passed into ATagParser.parseUrl (potential NPE). Still honor
                // the politeness delay before the next iteration.
                Thread.sleep(waittime);
                continue;
            }
            Collection<String> urls = ATagParser.parseUrl(content, "http://www.cnblogs.com/");
            for (String url : urls) {
                if (rule.match(url)) {
                    redisClient.sadd(TARGET_URL, url);
                    LOGGER.info("目标url:{}已存储", url);
                    successCount.incrementAndGet();
                } else {
                    // FIX: was errorCount — rejected URLs are discards, and the
                    // final summary reports discardCount, which was never updated.
                    discardCount.incrementAndGet();
                    LOGGER.info("url:{}已废弃", url);
                }
            }
            Thread.sleep(waittime);
            if (urls.size() != 0) {
                redisClient.sadd(TODO_URL, urls.toArray(new String[urls.size()]));
                LOGGER.info("TODOUrl:{}已存储", urls.size());
            }
        }
        LOGGER.info("爬取结束:爬取成功{},不匹配{},异常{}", successCount.get(), discardCount.get(), errorCount.get());
    }


    /**
     * Creates default collaborators for any that were not supplied:
     * a proxied downloader, a Redis client on the default pool, and a rule
     * matching any URL containing "cnblogs.com".
     */
    private void init() {
        if (downLoader == null) {
            // NOTE(review): hard-coded proxy host/port — should come from
            // configuration; confirm this address is still valid.
            downLoader = new SimpleDownloader("10.37.235.10", 8080);
        }
        if (redisClient == null) {
            redisClient = new RedisClient(new JedisPool());
        }
        if (rule == null) {
            rule = new Rule() {
                @Override
                public boolean match(Object target) {
                    if (target instanceof String) {
                        String x = (String) target;
                        if (x.contains("cnblogs.com")) {
                            return true;
                        }
                    }
                    return false;
                }
            };
        }
    }

    /**
     * Not yet implemented.
     */
    public void doCrawler() throws DownLoaderException {
        // TODO: implement, or document why this Spider entry point is a no-op.
    }


}